require(quanteda)
require(LSX)
source("functions.R")

# Dictionary read from a YAML file; used further down to count dictionary
# matches in each chunk.
dict <- dictionary(file = "historical.yml")

# Article data; the publication year is derived from the `date` column.
dat_nyt <- readRDS("data/data_nytimes_fulltext.RDS")
dat_nyt[["year"]] <- lubridate::year(dat_nyt[["date"]])

# Corpus built from the `body` column of the article data.
corp <- corpus(dat_nyt, text_field = "body")

# Stored model object, passed to predict() later in the script
# (presumably a fitted LSS model -- confirm against the script that saved it).
lss <- readRDS("lss.RDS")

# Tokenise without punctuation, then filter one step at a time.
toks <- tokens(corp, remove_punct = TRUE)
# `month`, `day` and `title` are not defined in this script; presumably they
# come from functions.R -- verify.
toks <- tokens_remove(toks, c(month, day, title))
# Drop English stopwords together with tokens shorter than two characters.
toks <- tokens_remove(toks, stopwords("en"), min_nchar = 2)
# Keep only tokens made up entirely of the characters a-z and 0-9 ...
toks <- tokens_select(toks, "^[a-z0-9]+$", valuetype = "regex")
# ... and of those, discard the ones that consist of digits only.
toks <- tokens_remove(toks, "^[0-9]+$", valuetype = "regex")

# Number of tokens per chunk. Defined once because the chunking and both
# position calculations below must use the same value.
chunk_size <- 20

# Split every document into consecutive chunks of `chunk_size` tokens,
# carrying the document variables over to each chunk.
toks_chunk <- tokens_chunk(toks, size = chunk_size, use_docvars = TRUE)
dfmt_chunk <- dfm(toks_chunk)
dat <- docvars(dfmt_chunk)

# Token offset at which each chunk starts inside its source document
# (0, chunk_size, 2 * chunk_size, ...). segid() is the exported accessor for
# the within-document segment number, used here instead of reading the
# object's internal `docvars` slot directly.
dat$position <- (segid(dfmt_chunk) - 1) * chunk_size

# n = number of documents whose token count, rounded down to a multiple of
# chunk_size, equals `position`.
# NOTE(review): a document with exactly k * chunk_size tokens is binned at
# k * chunk_size, one bin past the start of its last chunk -- confirm that
# this is the intended binning.
dat_agg <- aggregate(
    list(n = rep(1, ndoc(toks))),
    by = list(position = floor(ntoken(toks) / chunk_size) * chunk_size),
    FUN = sum
)

# Score every chunk with the stored model.
dat[["lss"]] <- predict(lss, newdata = dfmt_chunk)

# Per chunk, count the tokens that tokens_lookup() maps to a dictionary key.
dat[["country"]] <- ntoken(tokens_lookup(toks_chunk, dictionary = dict))

# Average the model score and the dictionary count over all chunks that
# share a position, ignoring missing values.
dat_mean <- aggregate(
    x = list(lss = dat[["lss"]], country = dat[["country"]]),
    by = list(position = dat[["position"]]),
    FUN = mean,
    na.rm = TRUE
)

# Attach the per-position means to the document counts. Only positions
# present in both tables are kept.
dat_agg <- merge(x = dat_agg, y = dat_mean, by = "position", all.y = FALSE)

# Persist the combined table.
saveRDS(object = dat_agg, file = "data_fulltext.RDS")
